This notebook compares Larry’s exploratory forecaster for states against submissions to COVIDhub.
Get the dates we want forecasts for:
our_pred_dates <- get_covidhub_forecast_dates("CMU-TimeSeries")
n_dates <- length(our_pred_dates)
# n_dates - 4 is often the most recent date with ground truth for 4 weeks ahead
# dates spaced out 2 weeks ahead to see different behaviors
forecast_dates <- our_pred_dates[n_dates - 2 * 10:1]
Parameters for our forecaster that remain the same across forecast dates:
library(assertthat)
source("R/revised_larry.R")
aheads <- 1:4
lags <- c(0, 7, 14)
state_forecaster <- larrys_anteater
state_forecaster_name <- "larry's anteater"
# don't put an as_of column in the signals dataframe: this will be
# taken care of correctly by evalcast::get_predictions
# (i.e. as_of = the relevant forecast date)
state_forecaster_signals <- dplyr::tibble(
data_source = "jhu-csse",
signal = c("deaths_7dav_incidence_num",
"confirmed_7dav_incidence_num"
),
start_day = lubridate::ymd("2020-06-01"),
geo_type = "state"
)
state_forecaster_args <- list(
ahead = aheads,
lags = lags,
tau = evalcast::covidhub_probs(), # 23 quantiles
training_window_size = 90
)
State correction parameters (taken directly from production_params.R):
# state_corrections_params <- zookeeper::default_state_params(
# # many other options, see the function documentation
# data_source = state_forecaster_signals$data_source,
# signal = state_forecaster_signals$signal,
# geo_type = state_forecaster_signals$geo_type)
#
# state_corrector <- zookeeper::make_state_corrector(
# params = state_corrections_params,
# # data, locations, times to do special correction processing
# manual_flags = tibble::tibble(
# data_source = "jhu-csse",
# signal = c(rep("deaths_incidence_num", 3),
# "confirmed_incidence_num",
# ## from JHU-CSSE notes 2021-04-17, 2021-04-18
# "deaths_incidence_num",
# "deaths_incidence_num",
# "confirmed_incidence_num",
# "confirmed_incidence_num",
# ## ## from JHU-CSSE notes 2021-04-24, 2021-04-25
# ## "confirmed_incidence_num"
# ## from JHU-CSSE notes 2021-05-02
# "deaths_incidence_num"
# ),
# geo_value = c("va","ky","ok","ok",
# ## from JHU-CSSE notes 2021-04-17, 2021-04-18
# "ak","mi","mo","al",
# ## ## from JHU-CSSE notes 2021-04-24, 2021-04-25
# ## "al"
# ## from JHU-CSSE notes 2021-05-02
# "wv"
# ),
# time_value = list(
# seq(lubridate::ymd("2021-02-21"), lubridate::ymd("2021-03-04"), by = 1),
# lubridate::ymd(c("2021-03-18","2021-03-19")),
# lubridate::ymd("2021-04-07"),
# lubridate::ymd("2021-04-07"),
# ## from JHU-CSSE notes 2021-04-17, 2021-04-18
# lubridate::ymd("2021-04-15"),
# lubridate::ymd(c("2021-04-01", "2021-04-03", "2021-04-06", "2021-04-08", "2021-04-10", "2021-04-13", "2021-04-15", "2021-04-17",
# ## seems to be ongoing as of week of 2021-04-27
# "2021-04-20", "2021-04-22", "2021-04-24",
# "2021-04-27", "2021-04-29", "2021-05-01"
# )),
# lubridate::ymd("2021-04-17"),
# lubridate::ymd("2021-04-13","2021-04-20"),
# ## ## from JHU-CSSE notes 2021-04-24, 2021-04-25
# ## lubridate::ymd("2021-04-20")
# ## from JHU-CSSE notes 2021-05-02
# lubridate::ymd("2021-04-27")
# ),
# max_lag = c(rep(90, 4),
# ## from JHU-CSSE notes 2021-04-17, 2021-04-18
# 75, 150, 150, 180,
# ## from JHU-CSSE notes 2021-04-24, 2021-04-25
# ## as.integer(as.Date("2021-04-20") - as.Date("2020-10-23"))
# ## from JHU-CSSE notes 2021-05-02; just assign an arbitrary large value due to lack of accessible details
# 180
# )
# )
# )
Get the predictions for our forecaster:
state_predictions <- evalcast::get_predictions(
forecaster = state_forecaster,
name_of_forecaster = state_forecaster_name,
signals = state_forecaster_signals,
forecast_dates = forecast_dates,
incidence_period = "epiweek",
#apply_corrections = state_corrector,
forecaster_args = state_forecaster_args
)
Some post-processing for this forecaster:
state_predictions$value <- ifelse(state_predictions$value > 0,
state_predictions$value * 7,
0)
Get the competition and evaluate the predictions on the error metrics:
competition <- c("COVIDhub-ensemble", "COVIDhub-baseline",
"CMU-TimeSeries", "Karlen-pypm")
submitted <- lapply(competition[1:3], get_covidhub_predictions,
forecast_dates = forecast_dates,
signal = "deaths_incidence_num")
submitted[[4]] <- get_covidhub_predictions("Karlen-pypm",
forecast_dates = forecast_dates - 1,
signal = "deaths_incidence_num") %>%
mutate(forecast_date = forecast_date + 1)
submitted <- bind_rows(submitted) %>% filter(ahead < 5)
all_predictions <- bind_rows(state_predictions, submitted)
results <- evaluate_covid_predictions(all_predictions,
backfill_buffer = 0,
geo_type = "state") %>%
intersect_averagers(c("forecaster"), c("forecast_date", "geo_value"))
We compare the new forecaster to
NOTE: Results are based on the following numbers of common locations
results %>% group_by(forecast_date) %>% summarise(n_distinct(geo_value))
## # A tibble: 9 x 2
## forecast_date `n_distinct(geo_value)`
## * <date> <int>
## 1 2020-12-14 52
## 2 2020-12-28 52
## 3 2021-01-25 52
## 4 2021-02-08 52
## 5 2021-02-22 52
## 6 2021-03-08 52
## 7 2021-03-22 52
## 8 2021-04-05 52
## 9 2021-04-19 52
subtitle = sprintf("Forecasts made over %s to %s",
format(min(forecast_dates), "%B %d, %Y"),
format(max(forecast_dates), "%B %d, %Y"))
plot_canonical(results, x = "ahead", y = "ae", aggr = mean) +
labs(subtitle = subtitle, xlab = "Weeks ahead", ylab = "Mean AE") +
theme(legend.position = "bottom") +
scale_y_log10()
plot_canonical(results, x = "ahead", y = "wis", aggr = mean) +
labs(subtitle = subtitle, xlab = "Weeks ahead", ylab = "Mean WIS") +
theme(legend.position = "bottom") +
scale_y_log10()
plot_canonical(results, x = "ahead", y = "coverage_80", aggr = mean) +
labs(subtitle = subtitle, xlab = "Weeks ahead", ylab = "Mean Coverage") +
theme(legend.position = "bottom") +
coord_cartesian(ylim=c(0,1)) + geom_hline(yintercept = .8, color="black")
plot_canonical(results, x = "forecast_date", y = "ae", aggr = mean,
grp_vars = c("forecaster","ahead"), facet_cols = "ahead") +
labs(subtitle = subtitle, xlab = "forecast date", ylab = "Mean AE") +
theme(legend.position = "bottom") +
scale_y_log10()
plot_canonical(results, x = "forecast_date", y = "wis", aggr = mean,
grp_vars = c("forecaster","ahead"), facet_cols = "ahead") +
labs(subtitle = subtitle, xlab = "forecast date", ylab = "Mean WIS") +
theme(legend.position = "bottom") +
scale_y_log10()
plot_canonical(results, x = "forecast_date", y = "coverage_80", aggr = mean,
grp_vars = c("forecaster","ahead"), facet_cols = "ahead") +
labs(subtitle = subtitle, xlab = "forecast date", ylab = "Mean Coverage") +
theme(legend.position = "bottom") +
coord_cartesian(ylim=c(0,1)) + geom_hline(yintercept = .8, color="black")
Relative to baseline; scale first then take the median.
plot_canonical(results, x = "ahead", y = "wis", aggr = median,
base_forecaster = "COVIDhub-baseline", scale_before_aggr = TRUE) +
labs(subtitle = subtitle, xlab = "Weeks ahead", ylab = "Median relative WIS") +
theme(legend.position = "bottom") +
geom_hline(yintercept = 1)
plot_canonical(results, x = "forecast_date", y = "wis", aggr = median,
grp_vars = c("forecaster", "ahead"), facet_cols = "ahead",
base_forecaster = "COVIDhub-baseline", scale_before_aggr = TRUE) +
labs(subtitle = subtitle, xlab = "Forecast date", ylab = "Median relative WIS") +
theme(legend.position = "bottom") +
geom_hline(yintercept = 1)
Relative to baseline; scale first then take the geometric mean, ignoring a few 0’s. I think this is potentially more useful than the median/mean for relative WIS (or relative AE), but I haven’t completely thought it through. Putting the results here to be provocative.
geom_mean <- function(x) prod(x)^(1/length(x))
plot_canonical(results %>% filter(wis > 0), x = "ahead", y = "wis",
aggr = geom_mean,
base_forecaster = "COVIDhub-baseline", scale_before_aggr = TRUE) +
labs(subtitle = subtitle,
xlab = "Weeks ahead", ylab = "Mean (geometric) relative WIS") +
theme(legend.position = "bottom") +
geom_hline(yintercept = 1)
plot_canonical(results %>% filter(wis > 0), x = "forecast_date", y = "wis",
aggr = geom_mean, facet_cols = "ahead",
grp_vars = c("forecaster", "ahead"),
base_forecaster = "COVIDhub-baseline", scale_before_aggr = TRUE) +
theme(legend.position = "bottom") +
labs(subtitle = subtitle,
xlab = "Forecast date", ylab = "Mean (geometric) relative WIS") +
geom_hline(yintercept = 1)
plot_canonical(results, x = "target_end_date", y = "wis", aggr = mean,
dots = TRUE, grp_vars = "forecaster") +
labs(subtitle = subtitle, xlab = "Target date", ylab = "Mean WIS") +
theme(legend.position = "bottom") +
scale_y_log10()
plot_canonical(results, x = "target_end_date", y = "wis", aggr = mean,
dots = TRUE, grp_vars = c("forecaster", "ahead"),
facet_cols = "ahead") +
labs(subtitle = subtitle, xlab = "Target date", ylab = "Mean WIS") +
theme(legend.position = "bottom") +
scale_y_log10()
Note that the results are scaled by population.
maps <- results %>%
group_by(geo_value, forecaster) %>%
summarise(across(wis:ae, modeltools::Mean)) %>%
left_join(animalia::state_population, by = "geo_value") %>%
mutate(across(wis:ae, ~ .x / population * 1e5)) %>%
pivot_longer(wis:ae, names_to = "score") %>%
group_by(score) %>%
mutate(time_value = Sys.Date(),
r = modeltools::Max(value)) %>%
group_by(forecaster, .add = TRUE) %>%
group_split()
maps <- purrr::map(maps, ~as.covidcast_signal(
.x, signal = .x$score[1], data_source = .x$forecaster[1], geo_type = "state"))
maps <- purrr::map(maps,
~plot(.x, choro_col = scales::viridis_pal()(3),
range = c(0,.x$r[1])))
nfcasts <- length(unique(results$forecaster))
cowplot::plot_grid(plotlist = maps[1:nfcasts], ncol = 3)
cowplot::plot_grid(plotlist = maps[(nfcasts+1):length(maps)], ncol = 3)
pd <- evalcast:::setup_plot_trajectory(
bind_rows(state_predictions, submitted %>% filter(forecaster == "CMU-TimeSeries")),
intervals = 0.8,
geo_type = "state",
start_day = min(forecast_dates) - 60)
g <- ggplot(pd$truth_df, mapping = aes(x = target_end_date))
# build the fan
g <- g + geom_ribbon(
data = pd$quantiles_df,
mapping = aes(ymin = lower, ymax = upper, fill = forecaster,
group = interaction(forecaster,forecast_date)),
alpha = .1) +
scale_fill_viridis_d(begin=.15, end=.85)
# line layer
g <- g +
#geom_line(aes(y = .data$value.y), color = "#3182BD") + # corrected
geom_line(aes(y = value)) + # reported
geom_line(data = pd$points_df,
mapping = aes(y = value, color = forecaster,
group = interaction(forecaster,forecast_date)),
size = 1) +
geom_point(aes(y = value)) + # reported gets dots
geom_point(data = pd$points_df,
mapping = aes(y = value, color = forecaster),
size = 3) +
scale_color_viridis_d(begin=.15, end=.85)
g + theme_bw(base_size = 20) +
facet_wrap(~geo_value, scales = "free_y", ncol = 5) +
theme(legend.position = "top") + ylab("") + xlab("")